#coding:utf-8
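'''
Load the ChnSentiCorp dataset: download it from the network once, save it to a
local directory, then load it from disk and inspect its contents.
'''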
from datasets import load_dataset
from datasets import load_from_disk

# ---------------------------------------------------------------------------
# Data download
# ---------------------------------------------------------------------------
# Option 1 (disabled): load from the network.
#   datasets = load_dataset('seamew/ChnSentiCorp')
#   Default download path: /root/.cache/huggingface/datasets/seamew___chn_senti_corp
#   Storage format: DatasetDict
#
# Option 2 (active): load the copy stored on the local disk.
datasets = load_from_disk("./data/ChnSentiCorp")

# ---------------------------------------------------------------------------
# Inspect the dataset
# ---------------------------------------------------------------------------
# Print the dataset summary (original note lists: train, validation, test).
print(datasets)

# Training split. The variable name is kept unchanged; the data comes from
# the ./data/ChnSentiCorp directory loaded above.
squad_train = datasets["train"]

# Show the first record ("第1条数据" = "record #1").
print("第1条数据: ")
print(squad_train[0])

# Disabled: show two records using a slice.
# print("Two examples from the dataset using slice operation: \n")
# print(squad_train[14:16])

# Show the feature schema ("特征" = "features"), preceded by a blank line.
print("\n特征: ")
print(squad_train.features)

# Print basic table metadata. Each attribute is looked up right before it is
# printed, so the output order matches one print statement per attribute.
for label, attr_name in (
    ("Column names: ", "column_names"),
    ("Number of rows: ", "num_rows"),
    ("Number of columns: ", "num_columns"),
    ("Shape: ", "shape"),
):
    print(label, getattr(squad_train, attr_name))

# ---------------------------------------------------------------------------
# Disabled snippet: IPython display of random examples
# ---------------------------------------------------------------------------
# import random
# import pandas as pd
# from IPython.display import display, HTML
# def display_random_examples(dataset=squad_train, num_examples=5):
#     assert num_examples < len(dataset)
#
#     random_picks = []
#     for i in range(num_examples):
#         random_pick = random.randint(0,len(dataset)-1)
#         random_picks.append(random_pick)
#
#     df = pd.DataFrame(dataset[random_picks])
#     display(HTML(df.to_html()))
#
# display_random_examples(squad_train, 3)
